New York City Taxi and Limousine Commission data for Green Taxis This DS challenge is designed to evaluate your skills and intuition regarding a real world data problem. Data set: New York City Taxi and Limousine Commission trip records https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
The yellow and green taxi trip records include fields capturing pick-up and drop-off dates/times, pick-up and drop-off locations, trip distances, itemized fares, rate types, payment types, and driver-reported passenger counts.
We'll use data from Green Taxis for September 2015.
Load data and analyze:
Please submit the result in the form of runnable notebooks or scripts. A link to GitHub or other code repository would be great. Please let us know if we need to do anything special to run your notebook (install packages, get extra data etc.)
from urllib.parse import parse_qs, urlparse

# Shareable Google Drive link of the CSV that is downloaded further down.
link = "https://drive.google.com/open?id=1CFzqt-Ot1WPkW9pOGg_bYk4B7NJiXeGI"

# Read the file id from the query string. Unpacking link.split("=") raises
# ValueError as soon as the URL carries a second "=" (e.g. "&usp=sharing").
# NOTE: `id` shadows the builtin; the name is kept because the download step
# later in the file reads it.
id = parse_qs(urlparse(link).query)["id"][0]
import warnings
warnings.filterwarnings('ignore')
!pip install geopy
!pip install wget
!pip install geopandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import requests
import datetime as dt
import dask.dataframe as dask_dataframe
import dask.distributed
import scipy
import geopandas
import wget
from geopy.distance import vincenty
from shapely.geometry import Point
from sklearn import metrics
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.model_selection import RandomizedSearchCV, train_test_split
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# `import scipy` alone does not guarantee that the `stats` submodule is loaded,
# so import it explicitly before calling scipy.stats.describe below.
import scipy.stats

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Fetch the shared file (identified by the id parsed from `link`) to local disk.
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('data_after_dask.csv')

data = pd.read_csv('data_after_dask.csv')
data.head()
data.describe()

# Difference between the reported trip distance and the pre-computed
# `distance_calculated` column, summarised with scipy's descriptive statistics.
distance_error = data["Trip_distance"].to_numpy() - data["distance_calculated"].to_numpy()
scipy.stats.describe(distance_error)
# Taxi zone lookup table: one row per LocationID with borough, zone name and
# service zone.
url = r'https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv'
taxi_zone = pd.read_csv(url)

# Two independently named copies so the lookup can be joined once for the
# pickup side and once for the dropoff side.
pick_zone = taxi_zone.copy()
drop_zone = taxi_zone.copy()
print(pick_zone.columns)
zone_fields = ['locationID', 'borough', 'zonename', 'servicezone']
pick_zone.columns = ['pickup_' + field for field in zone_fields]
drop_zone.columns = ['dropoff_' + field for field in zone_fields]
pick_zone.head()
drop_zone.head()

# Left joins keep every trip, even when its zone id has no lookup entry.
result = data.merge(pick_zone, how='left',
                    left_on='pickup_zone', right_on='pickup_locationID')
result.head()
result = result.merge(drop_zone, how='left',
                      left_on='dropoff_zone', right_on='dropoff_locationID')
result.head()

# The join keys duplicate pickup_zone / dropoff_zone, so drop them.
result = result.drop(columns=['pickup_locationID', 'dropoff_locationID'])
result.head()
result["pickup_servicezone"].value_counts()
result["pickup_borough"].value_counts()
def get_hours(result, column_name):
    """
    Getting the hour of the day

    Input:
    result: DataFrame
    column_name: column from which hours have to be extracted; may hold
        datetimes or datetime-like strings
    Output:
    Pandas series of the hours (0-23); unparseable values yield NaN
    """
    # pd.to_datetime leaves a datetime column unchanged and lets the helper
    # also work on a column that has not been parsed yet.
    return pd.to_datetime(result[column_name], errors='coerce').dt.hour
def get_day(result, column_name):
    """
    Getting the weekday of the day

    Input:
    result: DataFrame
    column_name: column from which the weekday has to be extracted; may hold
        datetimes or datetime-like strings
    Output:
    Pandas series of weekday names ('Monday' ... 'Sunday')
    """
    # `.dt.weekday_name` was removed from pandas; `.dt.day_name()` is the
    # replacement and returns the same English day names.
    return pd.to_datetime(result[column_name], errors='coerce').dt.day_name()
# Parse both timestamp columns; values that cannot be parsed become NaT.
for stamp_col in ('pickup_date', 'dropoff_date'):
    result[stamp_col] = pd.to_datetime(result[stamp_col], errors='coerce')

# Derive hour-of-day first, then weekday, for the pickup and dropoff side
# (this keeps the new columns in the order hr, hr, day, day).
for suffix, extract in (('hr', get_hours), ('day', get_day)):
    for side in ('pickup', 'dropoff'):
        result[f'{side}_{suffix}'] = extract(result, f'{side}_date')
result.head()
After studying the description of the data, we find a couple of imperfections in data quality.
We will have to clean the data based on which task we are accomplishing
data = result.copy()


def _drop_flagged(frame, flagged, message):
    """Print how many rows `flagged` selects, then return `frame` without them."""
    print(message, frame[flagged].shape)
    # .copy() so the column assignments below never write into a view.
    kept = frame[~flagged].copy()
    print("Remaining Data:", kept.shape)
    return kept


# Trips whose pickup or dropoff could not be matched to a taxi zone.
condition = data["dropoff_zone"].isnull() | data["pickup_zone"].isnull()
data = _drop_flagged(data, condition, "Data having pickup zone or drop zone outside New York:")

# The parentheses matter: `|` binds tighter than `<=`, so without them the
# expression compared (isnull | Passenger_count) against 0.
condition = data["Passenger_count"].isnull() | (data["Passenger_count"] <= 0)
data = _drop_flagged(data, condition, "Data with wrong number of passendger:")

# Fare and total below 2.5, or any negative surcharge/tip/toll.
condition = (
    (data['Fare_amount'] < 2.5)
    | (data['Extra'] < 0)
    | (data['MTA_tax'] < 0)
    | (data['Tip_amount'] < 0)
    | (data['Tolls_amount'] < 0)
    | (data['improvement_surcharge'] < 0)
    | (data['Total_amount'] < 2.5)
)
data = _drop_flagged(data, condition, "Data with wrong amounts:")

# Trip duration in whole minutes. `.astype('timedelta64[m]')` is not supported
# by current pandas, so derive the minutes from total_seconds().
data['trip_time'] = (data['dropoff_date'] - data['pickup_date']).dt.total_seconds() // 60
# Missing durations (unparseable timestamps) are dropped too; otherwise they
# would slip through both comparisons and produce NaN speeds.
condition = data['trip_time'].isnull() | (data['trip_time'] <= 0) | (data['trip_time'] > 200)
data = _drop_flagged(data, condition, "Data with wrong trip durations:")

#Speed (distance per hour, since trip_time is in minutes)
data['trip_speed'] = data['Trip_distance'] * 60 / data['trip_time']
#Tip percentage of Fare amount
data['Tip_percent_amount'] = data['Tip_amount'] * 100 / data['Fare_amount']
#Tip percentage of trip duration
data['Tip_percent_time'] = data['Tip_amount'] * 100 / data['trip_time']

# The two messages below previously repeated "wrong amounts" (copy-paste).
condition = (data['Tip_percent_amount'] > 50)
data = _drop_flagged(data, condition, "Data with implausible tip percentage (>50% of fare):")

condition = (data['trip_speed'] > 150) | (data['trip_speed'] < 0)
data = _drop_flagged(data, condition, "Data with implausible trip speed:")
# density=True normalises the bars so the plot matches its
# "Probability density" title; without it the bars are raw counts.
plt.hist(data['Trip_distance'], 50, density=True, facecolor='blue', alpha=0.8)
plt.xlabel('Trip Distance in miles')
plt.ylabel('Probability')
plt.title(r'Probability density of Trip distance')
plt.show()
from scipy.stats import lognorm

fig, ax = plt.subplots(1, 3, figsize=(20, 5))
dist = data['Trip_distance']
mile_bins = range(0, 120, 1)

# Same histogram twice: linear count scale, then log count scale.
ax[0].hist(dist, bins=mile_bins, facecolor='green', alpha=0.8)
ax[1].hist(dist, bins=mile_bins, facecolor='green', alpha=0.8)

# Third panel: drop trips more than 4 standard deviations from the median.
is_outlier = (dist - dist.median()).abs() > 4 * dist.std()
ax[2].hist(dist[~is_outlier], bins=range(0, 16, 1), facecolor='green',
           alpha=0.8, label='data')

ax[0].set_xlabel('Trip Distance')
ax[0].set_ylabel('Number of trips (Count - Scale)')
ax[0].set_title('Histogram - Trip Distance')
ax[1].set_xlabel('Trip Distance')
ax[1].set_ylabel('Number of trips (log - Scale)')
ax[1].set_yscale('log')
ax[1].set_title('Histogram - Trip Distance with log scale')
ax[2].set_xlabel('Trip Distance')
ax[2].set_ylabel('Number of trips')
ax[2].set_title('Histogram - Trip Distance without outliers')

# Fit line. lognorm.fit returns (shape, loc, scale); the keyword arguments
# are only starting guesses for the optimiser.
shape, loc, scale = lognorm.fit(dist.values, scale=dist.mean(), loc=0)
grid = np.arange(0, 12, .2)
pdf_fitted = lognorm.pdf(grid, shape, loc, scale)
# Convert the density to expected counts per 1-mile bin using the actual
# sample size instead of a hard-coded 1400000.
bin_width = 1
ax[2].plot(grid, len(dist) * bin_width * pdf_fitted, 'r', label='lognormal fit')
# Labels are attached to the artists themselves, so the legend cannot pair
# the names with the wrong handles.
ax[2].legend()
plt.show()
The trip distance distribution is asymmetric and positively skewed. The skewness is due to the high concentration of rides near the lower bound (0). This is the structure of a lognormal distribution.
The trips are not random because the distribution is not normal.
def pickup_groupby(data):
    """
    Groupby the data by column='pickup_zone'. Operator: Count

    Input:
    data: DataFrame, Taxi data after joined with taxi zone data; only the
        'pickup_zone' column is read
    Output:
    DataFrame with index='pickup_zone' and a single column 'count_pick_zone'
    """
    # size() counts rows per zone and always yields exactly one column.
    # The previous count()-then-rename only worked when `data` had exactly
    # two columns and raised a length mismatch otherwise.
    return data.groupby('pickup_zone').size().to_frame('count_pick_zone')
def dropoff_groupby(data):
    """
    Groupby the data by column='dropoff_zone'. Operator: Count

    Input:
    data: DataFrame, Taxi data after joined with taxi zone data; only the
        'dropoff_zone' column is read
    Output:
    DataFrame with index='dropoff_zone' and a single column 'count_dropoff_zone'
    """
    # size() counts rows per zone and always yields exactly one column.
    # The previous count()-then-rename only worked when `data` had exactly
    # two columns and raised a length mismatch otherwise.
    return data.groupby('dropoff_zone').size().to_frame('count_dropoff_zone')
import bokeh, bokeh.plotting, bokeh.models
from bokeh.io import output_notebook, show, output_file, curdoc
from bokeh.layouts import row, column
from bokeh.models import ColumnDataSource, Slider
from bokeh.plotting import figure
output_notebook()
def _zone_count_figure(source, title, count_field, palette, hover_label):
    """Build one log-coloured zone map of `count_field` with hover and colour bar."""
    tool_list = ['pan', 'wheel_zoom', 'reset', 'hover', 'lasso_select', 'tap']
    fig = bokeh.plotting.figure(title=title, tools=tool_list,
                                plot_width=500, plot_height=500)
    # Reverse the palette so larger counts get the darker colours.
    mapper = bokeh.models.LogColorMapper(palette=palette[::-1], low=1, high=5000)
    fig.patches('xs', 'ys',
                fill_color={'field': count_field, 'transform': mapper},
                fill_alpha=1, line_color="gray", source=source)
    fig.grid.grid_line_color = None

    hover = fig.select_one(bokeh.models.HoverTool)
    hover.point_policy = "follow_mouse"
    hover.tooltips = (
        '<div>'
        '<div class="bokeh_hover_tooltip">Name : @zone</div>'
        f'<div class="bokeh_hover_tooltip">{hover_label} : @{count_field}</div>'
        '</div>'
    )

    colour_bar = bokeh.models.ColorBar(
        color_mapper=mapper,
        orientation='horizontal',
        ticker=bokeh.models.FixedTicker(ticks=[5, 10, 50, 100, 500, 1000, 5000, 10000]),
        formatter=bokeh.models.PrintfTickFormatter(format='%d'),
        label_standoff=12,
        border_line_color=None,
        location=(0, 0))
    fig.add_layout(colour_bar, 'above')
    return fig


def bokeh_map(df):
    """
    Show New York City pickup and dropoff heat maps side by side

    Input:
    df: GeoDataFrame of taxi zones; the plots read the columns
        'count_pick_zone' and 'count_dropoff_zone' and the tooltip reads 'zone'
    Output:
    None; the two figures are rendered with show()
    """
    # Both figures share one data source built from the frame's GeoJSON.
    gjds = bokeh.models.GeoJSONDataSource(geojson=df.to_json())
    pickup = _zone_count_figure(gjds, "NYC Green Taxi Pickup", 'count_pick_zone',
                                bokeh.palettes.Blues9, 'Trips Start')
    # The dropoff tooltip used to read "Trips Start" as well (copy-paste).
    dropoff = _zone_count_figure(gjds, "NYC Green taxi Dropoff", 'count_dropoff_zone',
                                 bokeh.palettes.Greens9, 'Trips End')
    p = row(pickup, dropoff)
    show(p)
This cell below loads shape file (compatible with geopandas) to chart different taxi zones on Bokeh map
The link below is the hyperlink of the data available on the NYC Taxi website [https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page]
import ssl
import zipfile
from pathlib import Path

# SECURITY NOTE(review): this switches off TLS certificate verification for
# default HTTPS contexts in the whole process. It is kept because the
# download below was written to rely on it; remove it once the archive host
# serves a certificate that validates.
ssl._create_default_https_context = ssl._create_unverified_context

archive_url = 'https://archive.nyu.edu/bitstream/2451/36743/3/nyu_2451_36743_WGS84.zip'
archive_name = 'nyu_2451_36743_WGS84.zip'

# Skip the download when the archive is already present (re-running the cell
# would otherwise fetch it again).
if not Path(archive_name).exists():
    wget.download(archive_url, out=archive_name)

# The context manager closes the zip handle, which the one-liner never did.
with zipfile.ZipFile(archive_name, 'r') as archive:
    archive.extractall()

# Re-project to WGS84 lat/lon. `epsg=4326` replaces the deprecated
# {'init': 'epsg:4326'} dictionary form.
df_shape = geopandas.read_file('nyu_2451_36743_WGS84/nyu_2451_36743.shp').to_crs(epsg=4326)
df_shape = df_shape.drop(['Shape_Area', 'Shape_Leng', 'OBJECTID'], axis=1)
def generate_bokeh_data(df_shape, data):
    """
    Attach per-zone pickup and dropoff counts to the zone shapes

    Input:
    df_shape: GeoDataFrame of taxi zones with a 'LocationID' column
    data: DataFrame with 'pickup_zone' and 'dropoff_zone' columns
    Output:
    df_shape left-joined with 'count_pick_zone' and 'count_dropoff_zone'
    """
    zone_pairs = data[["pickup_zone", 'dropoff_zone']]
    # Both joins match the zone id in the shapes against the index of the counts.
    join_opts = dict(left_on='LocationID', right_index=True, how='left')
    with_pickups = df_shape.merge(pickup_groupby(zone_pairs), **join_opts)
    return with_pickups.merge(dropoff_groupby(zone_pairs), **join_opts)
# Build the merged zone/count frame once and reuse it for the preview and the
# map, instead of repeating both group-bys and both joins.
bokeh_data = generate_bokeh_data(df_shape, data)
bokeh_data.head()
bokeh_map(bokeh_data)
The pickups are not evenly distributed. They are mostly concentrated in the regions touching the East River, i.e. the boundary between Manhattan and Queens or Brooklyn. There are no pickups in the Downtown Manhattan regions because of the regional agreement between the Green and Yellow Taxi associations.
# Mean / Median for hourly study
def hourly_distance(data):
    """
    Mean and median trip distance for every pickup hour

    Input:
    data: DataFrame with 'Trip_distance' and 'pickup_hr' columns
    Output:
    DataFrame indexed by 'pickup_hr' with columns
    'Mean_trip_distance' and 'Median_trip_distance'
    """
    # One group-by with two aggregations replaces the two separate group-bys
    # (and the misspelled intermediate 'Medium_trip_distance' column).
    hr_dist = data.groupby('pickup_hr')['Trip_distance'].agg(['mean', 'median'])
    hr_dist.columns = ['Mean_trip_distance', 'Median_trip_distance']
    return hr_dist
# City-wide mean and median trip distance per pickup hour (one line each).
hourly = hourly_distance(data)
plt.plot(hourly)
plt.title('Mean and median distance')
plt.xlabel('Pickup hour')
plt.ylabel('Trip distance (miles)')
plt.legend(['Mean distance', 'Median distance'])
The average trip distance is very high during early morning and late nights. This reflects the lifestyle of NYC.
data['pickup_borough'].unique()
borough = data['pickup_borough'].unique()

# Size the grid from the number of boroughs (three per row). Six boroughs
# give the same 2x3 layout as before; a seventh no longer raises IndexError.
n_cols = 3
n_rows = max(1, -(-len(borough) // n_cols))
fig, ax = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows),
                       sharex=True, squeeze=False)
fig.suptitle('Mean and median distance')

# ax.flat walks the panels row by row, matching the old ax[i//3, i%3] order.
for panel, name in zip(ax.flat, borough):
    panel.plot(hourly_distance(data[data['pickup_borough'] == name]))
    panel.set_title(name)
    panel.set_xlabel('Pickup hour')
    panel.set_ylabel('Trip distance (miles)')
plt.show()
The four main boroughs show results similar to the overall NYC graph. The average ride distance for pickups in Manhattan and Brooklyn is a little higher than in the Bronx and Queens. Rides in Staten Island look relatively random; moreover, we don't have enough data to say anything concrete about Staten Island and EWR.
# Number of rides per (borough, pickup hour); count() tallies the non-null
# pickup_zonename values in each group.
pick_count_borough = (
    data[['pickup_hr', 'pickup_borough', 'pickup_zonename']]
    .groupby(['pickup_borough', 'pickup_hr'])
    .count()
)
pick_count_borough.loc['Bronx']
labels = borough
width = 0.10  # the width of the bars
fig, ax = plt.subplots(1, 1, figsize=(20, 5))
for i, name in enumerate(labels):
    per_hour = pick_count_borough.loc[name]
    # Shift each borough's bars sideways so they sit next to each other.
    ax.bar(per_hour.index + width * i * 1.4, per_hour.pickup_zonename, width, label=name)
# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_title('Number of Taxi rides')
ax.set_xlabel('Pickup hour after midnight')
ax.set_ylabel('Count of Taxi rides')
ax.set_xticks(np.arange(0, 24, step=1))
ax.legend()
plt.show()
This graph offers interesting insights about taxi usage in different boroughs.
# Mean tip (as % of fare) per (borough, pickup hour).
tip_amount_borough = (
    data[['pickup_hr', 'pickup_borough', 'Tip_percent_amount']]
    .groupby(['pickup_borough', 'pickup_hr'])
    .mean()
)
tip_amount_borough.loc['Bronx']
labels = borough
width = 0.10  # the width of the bars
fig, ax = plt.subplots(1, 1, figsize=(20, 5))
for i, name in enumerate(labels):
    per_hour = tip_amount_borough.loc[name]
    # Shift each borough's bars sideways so they sit next to each other.
    ax.bar(per_hour.index + width * i * 1.4, per_hour.Tip_percent_amount, width, label=name)
ax.set_title('Tip percentage of Fare amount')
ax.set_xlabel('Pickup hour after midnight')
ax.set_ylabel('Tip/Fare amount - %')
ax.set_xticks(np.arange(0, 24, step=1))
ax.set_ylim(top=20)
ax.legend()
plt.show()
The general trend is that trips originating in Brooklyn tend to pay a higher tip as a percentage of the base fare amount. Manhattan comes next.
# Mean tip relative to trip duration per (borough, pickup hour).
tip_time_borough = (
    data[['pickup_hr', 'pickup_borough', 'Tip_percent_time']]
    .groupby(['pickup_borough', 'pickup_hr'])
    .mean()
)
tip_time_borough.loc['Bronx']
labels = borough
width = 0.10  # the width of the bars
fig, ax = plt.subplots(1, 1, figsize=(20, 5))
for i, name in enumerate(labels):
    per_hour = tip_time_borough.loc[name]
    # Shift each borough's bars sideways so they sit next to each other.
    ax.bar(per_hour.index + width * i * 1.4, per_hour.Tip_percent_time, width, label=name)
ax.set_title('Tip percentage of Trip duration')
ax.set_xlabel('Pickup hour after midnight')
ax.set_ylabel('Tip/(Time in minutes) - %')
ax.set_xticks(np.arange(0, 24, step=1))
ax.set_ylim(top=20)
ax.legend()
plt.show()
# Mean absolute tip per (borough, pickup hour).
tip_ride_borough = (
    data[['pickup_hr', 'pickup_borough', 'Tip_amount']]
    .groupby(['pickup_borough', 'pickup_hr'])
    .mean()
)
tip_ride_borough.loc['Bronx']
labels = borough
width = 0.10  # the width of the bars
fig, ax = plt.subplots(1, 1, figsize=(20, 5))
for i, name in enumerate(labels):
    per_hour = tip_ride_borough.loc[name]
    # Shift each borough's bars sideways so they sit next to each other.
    ax.bar(per_hour.index + width * i * 1.4, per_hour.Tip_amount, width, label=name)
ax.set_title('Average Tip amount')
ax.set_xlabel('Pickup hour after midnight')
ax.set_ylabel('Tip amount - $')
ax.set_xticks(np.arange(0, 24, step=1))
ax.set_ylim(top=4)
ax.legend()
plt.show()
The last 2 graphs also confirm the bias that trips starting from Brooklyn have a higher upside for the taxi drivers.
data.columns
fig, ax = plt.subplots(3, 1, figsize=(20, 15))
fig.suptitle('Variation of Tip with trip duration')

# .copy() so that binning trip_time below never writes into a slice of `data`.
temp_df = data[['trip_time', 'Tip_percent_amount', 'Tip_percent_time', 'Tip_amount']].copy()
# 50 equal-width duration bins, labelled 0..49.
temp_df['trip_time'] = pd.cut(temp_df['trip_time'], 50, labels=range(50))
# observed=False keeps empty bins in the result regardless of the pandas
# version's default for categorical group keys.
temp_df = temp_df.groupby('trip_time', observed=False).mean()

# (column, bar colour, y-axis label) for each of the three stacked panels.
# The first label used to read "Trip percentage ..." although the bars show tips.
panels = [
    ('Tip_percent_amount', 'orange', 'Tip percentage of Fare amount'),
    ('Tip_percent_time', 'blue', 'Tip percentage of trip duration'),
    ('Tip_amount', 'green', 'Tip amount - $'),
]
for axis, (column, colour, y_label) in zip(ax, panels):
    axis.bar(temp_df.index, temp_df[column], color=colour)
    # The x axis shows the bin number (0-49), not minutes.
    axis.set_xlabel('Trip Duration - Max: 200 minutes')
    axis.set_xticks(np.arange(0, 50, step=1))
    axis.set_ylabel(y_label)
plt.show()
The graphs above show that trips lasting between 30 and 50 minutes fetch the highest returns when measured on a per-distance or per-time basis. This is a crucial insight for the drivers.
fig, ax = plt.subplots(1, 3, figsize=(20, 5))
fig.suptitle('Variation of Tip with numbr of passengers')

# Mean of each tip metric per passenger count.
temp_df = data[['Passenger_count', 'Tip_percent_amount', 'Tip_percent_time', 'Tip_amount']]
temp_df = temp_df.groupby('Passenger_count').mean()

# (column, bar colour, y-axis label, y-axis upper limit or None) per panel.
# The first label used to read "Trip percentage ..." although the bars show tips.
panels = [
    ('Tip_percent_amount', 'orange', 'Tip percentage of Fare amount', 10),
    ('Tip_percent_time', 'blue', 'Tip percentage of trip duration', 15),
    ('Tip_amount', 'green', 'Tip amount - $', None),
]
for axis, (column, colour, y_label, y_top) in zip(ax, panels):
    axis.bar(temp_df.index, temp_df[column], color=colour)
    axis.set_xlabel('Number of passengers')
    if y_top is not None:
        axis.set_ylim(top=y_top)
    axis.set_ylabel(y_label)
plt.show()
def pickup_groupby_mean(data, column_name):
    """
    Mean of one column per pickup zone

    Input:
    data: DataFrame with a 'pickup_zone' column
    column_name: numeric column to average
    Output:
    Single-column DataFrame indexed by 'pickup_zone'
    """
    # Select the column before aggregating. Averaging every column first
    # raises on text/datetime columns in current pandas and does needless work.
    return data.groupby('pickup_zone')[[column_name]].mean()
def generate_bokeh_data_mean(df_shape, data, column_name):
    """
    Attach the per-pickup-zone mean of `column_name` to the zone shapes

    Input:
    df_shape: GeoDataFrame of taxi zones with a 'LocationID' column
    data: trip DataFrame with 'pickup_zone' and `column_name`
    column_name: column whose zone mean is joined on
    Output:
    df_shape left-joined with the zone means
    """
    zone_means = pickup_groupby_mean(data, column_name)
    # Zone id in the shapes is matched against the index of the means.
    return df_shape.merge(zone_means, how='left',
                          left_on='LocationID', right_index=True)
import bokeh, bokeh.plotting, bokeh.models
from bokeh.io import output_notebook, show, output_file, curdoc
from bokeh.layouts import row, column
from bokeh.models import ColumnDataSource, Slider
from bokeh.plotting import figure
output_notebook()
def bokeh_map_single1(df, column_name):
    """
    Show an NYC zone map coloured by `column_name` (blue palette, log scale 1-20)

    Input:
    df: GeoDataFrame of taxi zones containing `column_name`; the tooltip
        also reads a 'zone' field
    column_name: column to colour by (previously ignored in favour of a
        hard-coded 'Tip_percent_amount')
    Output:
    None; the figure is rendered with show()
    """
    gjds = bokeh.models.GeoJSONDataSource(geojson=df.to_json())
    tool_list = ['pan', 'wheel_zoom', 'reset', 'hover', 'lasso_select', 'tap']
    pickup = bokeh.plotting.figure(title=f"NYC Green Taxi - {column_name}",
                                   tools=tool_list, plot_width=500, plot_height=500)
    # Reverse the palette so larger values get the darker colours.
    blue = bokeh.models.LogColorMapper(palette=bokeh.palettes.Blues9[::-1], low=1, high=20)
    pickup.patches('xs', 'ys', fill_color={'field': column_name, 'transform': blue},
                   fill_alpha=1, line_color="gray", source=gjds)
    pickup.grid.grid_line_color = None

    hover_pickup = pickup.select_one(bokeh.models.HoverTool)
    hover_pickup.point_policy = "follow_mouse"
    # The value row is labelled with the column itself; it used to say
    # "Trips Start", a leftover from the trip-count map.
    hover_pickup.tooltips = (
        '<div>'
        '<div class="bokeh_hover_tooltip">Name : @zone</div>'
        f'<div class="bokeh_hover_tooltip">{column_name} : @{{{column_name}}}</div>'
        '</div>'
    )

    blue_bar = bokeh.models.ColorBar(color_mapper=blue,
                                     orientation='horizontal',
                                     ticker=bokeh.models.FixedTicker(ticks=[0.1, 0.5, 1, 5, 10]),
                                     formatter=bokeh.models.PrintfTickFormatter(format='%d'),
                                     label_standoff=12,
                                     border_line_color=None,
                                     location=(0, 0))
    pickup.add_layout(blue_bar, 'above')
    p = row(pickup)
    show(p)
# Map of the mean Tip_percent_amount per pickup zone.
bokeh_map_single1(generate_bokeh_data_mean(df_shape,data,'Tip_percent_amount'),'Tip_percent_amount')
Upper Manhattan and Brooklyn riders offer higher tip
import bokeh, bokeh.plotting, bokeh.models
from bokeh.io import output_notebook, show, output_file, curdoc
from bokeh.layouts import row, column
from bokeh.models import ColumnDataSource, Slider
from bokeh.plotting import figure
output_notebook()
def bokeh_map_single2(df, column_name):
    """
    Show an NYC zone map coloured by `column_name` (green palette, log scale 1-20)

    Input:
    df: GeoDataFrame of taxi zones containing `column_name`; the tooltip
        also reads a 'zone' field
    column_name: column to colour by (previously ignored in favour of a
        hard-coded 'Tip_percent_time')
    Output:
    None; the figure is rendered with show()
    """
    gjds = bokeh.models.GeoJSONDataSource(geojson=df.to_json())
    tool_list = ['pan', 'wheel_zoom', 'reset', 'hover', 'lasso_select', 'tap']
    pickup = bokeh.plotting.figure(title=f"NYC Green Taxi - {column_name}",
                                   tools=tool_list, plot_width=500, plot_height=500)
    # Reverse the palette so larger values get the darker colours.
    green = bokeh.models.LogColorMapper(palette=bokeh.palettes.Greens9[::-1], low=1, high=20)
    pickup.patches('xs', 'ys', fill_color={'field': column_name, 'transform': green},
                   fill_alpha=1, line_color="gray", source=gjds)
    pickup.grid.grid_line_color = None

    hover_pickup = pickup.select_one(bokeh.models.HoverTool)
    hover_pickup.point_policy = "follow_mouse"
    # The value row is labelled with the column itself; it used to say
    # "Trips Start", a leftover from the trip-count map.
    hover_pickup.tooltips = (
        '<div>'
        '<div class="bokeh_hover_tooltip">Name : @zone</div>'
        f'<div class="bokeh_hover_tooltip">{column_name} : @{{{column_name}}}</div>'
        '</div>'
    )

    green_bar = bokeh.models.ColorBar(color_mapper=green,
                                      orientation='horizontal',
                                      ticker=bokeh.models.FixedTicker(ticks=[0.1, 0.5, 1, 5]),
                                      formatter=bokeh.models.PrintfTickFormatter(format='%d'),
                                      label_standoff=12,
                                      border_line_color=None,
                                      location=(0, 0))
    pickup.add_layout(green_bar, 'above')
    p = row(pickup)
    show(p)
# Map of the mean Tip_percent_time per pickup zone.
bokeh_map_single2(generate_bokeh_data_mean(df_shape,data,'Tip_percent_time'),'Tip_percent_time')
Clearly, airport rides offer disproportionately high tips relative to trip duration.
import bokeh, bokeh.plotting, bokeh.models
from bokeh.io import output_notebook, show, output_file, curdoc
from bokeh.layouts import row, column
from bokeh.models import ColumnDataSource, Slider
from bokeh.plotting import figure
output_notebook()
def bokeh_map_single3(df, column_name):
    """
    Show an NYC zone map coloured by `column_name` (orange palette, log scale 1-2)

    Input:
    df: GeoDataFrame of taxi zones containing `column_name`; the tooltip
        also reads a 'zone' field
    column_name: column to colour by (previously ignored in favour of a
        hard-coded 'Tip_amount')
    Output:
    None; the figure is rendered with show()
    """
    gjds = bokeh.models.GeoJSONDataSource(geojson=df.to_json())
    tool_list = ['pan', 'wheel_zoom', 'reset', 'hover', 'lasso_select', 'tap']
    # Underscores become spaces, so 'Tip_amount' still yields the original
    # title "NYC Green Taxi - Tip amount".
    pickup = bokeh.plotting.figure(title="NYC Green Taxi - " + column_name.replace('_', ' '),
                                   tools=tool_list, plot_width=500, plot_height=500)
    # Reverse the palette so larger values get the darker colours.
    orange = bokeh.models.LogColorMapper(palette=bokeh.palettes.Oranges9[::-1], low=1, high=2)
    pickup.patches('xs', 'ys', fill_color={'field': column_name, 'transform': orange},
                   fill_alpha=1, line_color="gray", source=gjds)
    pickup.grid.grid_line_color = None

    hover_pickup = pickup.select_one(bokeh.models.HoverTool)
    hover_pickup.point_policy = "follow_mouse"
    # The value row is labelled with the column itself; it used to say
    # "Trips Start", a leftover from the trip-count map.
    hover_pickup.tooltips = (
        '<div>'
        '<div class="bokeh_hover_tooltip">Name : @zone</div>'
        f'<div class="bokeh_hover_tooltip">{column_name} : @{{{column_name}}}</div>'
        '</div>'
    )

    orange_bar = bokeh.models.ColorBar(color_mapper=orange,
                                       orientation='horizontal',
                                       ticker=bokeh.models.FixedTicker(ticks=[0.1, 0.5, 1, 2, 5]),
                                       formatter=bokeh.models.PrintfTickFormatter(format='%d'),
                                       label_standoff=12,
                                       border_line_color=None,
                                       location=(0, 0))
    pickup.add_layout(orange_bar, 'above')
    p = row(pickup)
    show(p)
# Map of the mean Tip_amount per pickup zone.
bokeh_map_single3(generate_bokeh_data_mean(df_shape,data,'Tip_amount'),'Tip_amount')
data.columns
## rate code = 2 or 3
## pickup or dropoff location in JFK, LGA, or EWR
# NOTE(review): the TLC zone lookup spells the airport zones out
# ('JFK Airport', 'LaGuardia Airport', 'Newark Airport'), so the bare codes
# alone would never match a zone name. Both spellings are accepted here;
# confirm against the lookup table.
airport_zones = ['JFK', 'LGA', 'EWR',
                 'JFK Airport', 'LaGuardia Airport', 'Newark Airport']
condition = (
    data['RateCodeID'].isin([2, 3])
    | data['pickup_zonename'].isin(airport_zones)
    | data['dropoff_zonename'].isin(airport_zones)
)

#calculate airport and non airport trips number
# A single vectorised assignment replaces the chained
# data['airport_trip'].loc[...] = ..., which can write to a temporary copy.
data['airport_trip'] = np.where(condition, 'Airport', 'Standard')

# Look the counts up by label; value_counts() is ordered by frequency, so
# positions 0 and 1 are not guaranteed to be Standard and Airport.
trip_counts = data['airport_trip'].value_counts()
print('Non - Airport Trips:', trip_counts.get('Standard', 0),
      'Airport Trips', trip_counts.get('Airport', 0))

# Mean and standard deviation of the total amount for each trip class.
airport_fare = data.groupby('airport_trip')['Total_amount'].agg(['mean', 'std'])
airport_fare.columns = ['mean_total_amount', 'standard_deviation']
airport_fare
Basic level of cleaning is already done. Here I will examine each attribute and its logical sense
data.columns
#No need of vendorID
data = data.drop(columns=['VendorID'])
#RateCodeID
data['RateCodeID'].value_counts()
Most of the rides are with Standard Fares (~98%)
Next: Latitudes and longitudes are already analysed while performing visualization
# Distribution of trips by passenger count.
data['Passenger_count'].value_counts()
Field with zero passenger counts already removed in the previous analysis
data['Payment_type'].value_counts()
data[data['Trip_distance'] == 0].shape

# Drop trips where both the reported and the pre-computed distance are
# below 0.2.
condition = data["Trip_distance"].lt(0.2) & data["distance_calculated"].lt(0.2)
print("Data having small trip distance:", data.loc[condition].shape)
data = data.loc[~condition]
print("Remaining Data:", data.shape)
data['Tip_amount'].describe()
Since tip amounts are not available for cash payments, we will run the model on credit-card payments only. However, the model should also work for cash-payment trips, because the distributions of the other variables for payment=credit and payment=cash are nearly identical and the two groups are of similar size.
# Keep only trips with Payment_type == 1 (the credit-card trips discussed above).
tip_condition = data['Payment_type'] == 1
# .copy() gives tip_data its own storage. Columns are added to it later, and
# writing into a boolean-mask slice of `data` is unreliable.
tip_data = data[tip_condition].copy()
tip_data.shape
For a taxi driver, let's brainstorm which parameters he or she can judge beforehand to maximize the return from a trip.
# Bucket the trip duration. The bins are left-closed (right=False), so the
# last edge is infinity: with an upper edge of 200 a trip of exactly 200
# minutes (which the cleaning step keeps) fell outside every bin.
tip_data['time_range'] = pd.cut(tip_data['trip_time'],
                                [0, 15, 30, 60, np.inf],
                                labels=['15 minutes', '30 minutes', '60 minutes', '60+ minutes'],
                                include_lowest=True,
                                right=False,
                                retbins=False)
# Uber/Lyft ofer driver 3-hour intervals slot. That means most of the drivers prefer 3 hour slots
labels = ['0-2', '3-5', '6-8', '9-11', '12-14', '15-17', '18-20', '21-23']
# Edges 0, 3, ..., 24 with left-closed bins give one bucket per 3-hour slot.
tip_data['pickuphour_range'] = pd.cut(tip_data['pickup_hr'], list(np.arange(0, 25, 3)),
                                      labels=labels, right=False)
data.drop(columns=['Store_and_fwd_flag', 'Ehail_fee'], inplace=True)
tip_data.drop(columns=['Store_and_fwd_flag', 'Ehail_fee'], inplace=True)
tip_data.columns
# 2x3 grid of scatter plots: tip as % of fare against each candidate driver.
fig, ax = plt.subplots(2, 3, figsize=(20, 10), sharey=True)
fig.subplots_adjust(hspace=.4)
cont_list = ['pickup_borough', 'trip_time', 'pickup_hr', 'Passenger_count', 'Fare_amount', 'time_range']
# ax.flat walks the panels row by row, the same order as ax[i//3][i%3].
for panel, variable in zip(ax.flat, cont_list):
    # A fresh random sample of 500 trips is drawn for every panel.
    graph_data = tip_data.sample(500)
    panel.scatter(x=graph_data[variable], y=graph_data['Tip_percent_amount'],
                  color='purple', alpha=.4)
    panel.set(title=variable, ylabel='Tip - % of the amount')
# 2x3 grid of scatter plots: tip relative to trip duration against each
# candidate driver.
fig, ax = plt.subplots(2, 3, figsize=(20, 10), sharey=True)
fig.subplots_adjust(hspace=.4)
cont_list = ['pickup_borough', 'trip_time', 'pickup_hr', 'Passenger_count', 'Fare_amount', 'time_range']
# ax.flat walks the panels row by row, the same order as ax[i//3][i%3].
for panel, variable in zip(ax.flat, cont_list):
    # A fresh random sample of 500 trips is drawn for every panel.
    graph_data = tip_data.sample(500)
    panel.scatter(x=graph_data[variable], y=graph_data['Tip_percent_time'],
                  color='Orange', alpha=.4)
    # The y axis shows Tip_percent_time; the label used to be copied from the
    # fare-amount figure.
    panel.set(title=variable, ylabel='Tip - % of the trip duration')
The first question is which parameter matters most to the drivers. Judging from the various protests by Uber/Lyft drivers, they often complain about not being paid enough for the hours they put in.
Hence the initial hypothesis was that there are 2 metrics drivers are concerned about: the tip as a percentage of the fare amount and the tip relative to the trip duration.
The 12 graphs above suggest that tips are more closely related to the fare amount than to the time spent on the trip. The graphs for trip time, pickup hour and fare amount loosely suggest that the tipping norm is around 20% of the base fare.
Hence we apply the model on Tip % amount to understand the tipping patterns
# Quick look at the modelling frame: service-zone mix and the first rows.
tip_data['pickup_servicezone'].value_counts()
tip_data.head(5)
# Import necessary libraries to build model
import statsmodels.api as sm
from statsmodels.tools.eval_measures import mse
from statsmodels.formula.api import ols
from sklearn.preprocessing import PolynomialFeatures,StandardScaler
import sklearn.model_selection
import sklearn.metrics
from sklearn_pandas import DataFrameMapper
# Keep an untouched copy before the categorical columns are expanded.
tip_data_wo_dummy = tip_data.copy()
# create dummy variables
categorical_columns = [
    'Passenger_count', 'pickup_borough', 'pickup_servicezone',
    'dropoff_borough', 'dropoff_servicezone', 'pickup_day', 'airport_trip',
    'time_range', 'pickuphour_range',
]
tip_data = pd.get_dummies(tip_data, columns=categorical_columns)
tip_data.columns
# Continuous features: standardised by the mapper.
scaled_columns = ['Fare_amount', 'Extra', 'Tolls_amount', 'MTA_tax',
                  'improvement_surcharge', 'trip_speed', 'Trip_distance']

# Dummy features: passed through unchanged. Each entry expands to
# '<prefix>_<level>' in the order listed, e.g. 'pickup_day_Friday'.
dummy_levels = {
    'Passenger_count': [str(n) for n in range(1, 10)],
    'pickup_borough': ['Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island'],
    'pickup_servicezone': ['Airports', 'Boro Zone', 'Yellow Zone'],
    'dropoff_borough': ['Bronx', 'Brooklyn', 'EWR', 'Manhattan', 'Queens', 'Staten Island'],
    'dropoff_servicezone': ['Airports', 'Boro Zone', 'EWR', 'Yellow Zone'],
    'pickup_day': ['Friday', 'Monday', 'Saturday', 'Sunday',
                   'Thursday', 'Tuesday', 'Wednesday'],
    'airport_trip': ['Airport', 'Standard'],
    'time_range': ['15 minutes', '30 minutes', '60 minutes', '60+ minutes'],
    'pickuphour_range': ['0-2', '3-5', '6-8', '9-11', '12-14', '15-17', '18-20', '21-23'],
}
dummy_columns = []
for prefix, levels in dummy_levels.items():
    dummy_columns.extend(f'{prefix}_{level}' for level in levels)

mapper = DataFrameMapper([
    (scaled_columns, [StandardScaler()]),
    (dummy_columns, None),
])
from sklearn.metrics import explained_variance_score

# 80/20 split of the mapped feature matrix against the tip percentage target.
X_train, X_test, y_train, y_test = train_test_split(
    mapper.fit_transform(tip_data), tip_data['Tip_percent_amount'],
    test_size=0.2, random_state=42)

# Ordinary least squares with an explicit intercept column.
model = sm.OLS(y_train, sm.add_constant(X_train)).fit()
predictions = model.predict(sm.add_constant(X_test))
model.summary()

print('RMSE=', np.sqrt(mse(predictions, y_test)))
# The printed value is the explained variance score, which only equals R^2
# when the residuals have zero mean, so it is labelled as what it is.
print('Explained variance=', explained_variance_score(y_test, predictions))
The linear regression model suggests the following insights
1. Fare_amount
2. Extra
3. Tolls_amount
4. MTA_tax
5. improvement_surcharge
6. trip_speed
7. Trip_distance
from sklearn.linear_model import Lasso, LassoCV

# Candidate regularization penalties tried by the cross-validated lasso.
penalty_grid = [0.01, 0.1, 0.5, 1, 5, 10, 50, 100]
lasso_model = LassoCV(alphas=penalty_grid)  #Regularization penalty
lasso_model.fit(X_train, y_train)

# score() returns the coefficient of determination on the given data.
lasso_train_score = lasso_model.score(X_train, y_train)
lasso_test_score = lasso_model.score(X_test, y_test)
print('Train coefficient R^2=', lasso_train_score)
print('Test coefficient R^2=', lasso_test_score)
lasso_model.coef_
The results from the regularized linear regression suggest insights similar to those from the plain linear regression.
import xgboost
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
# Random forest with 50 trees; random_state fixed for reproducibility.
rf = RandomForestRegressor(n_estimators=50, random_state=42)
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)
errors = abs(predictions - y_test)
print('Mean Absolute Error:', round(np.mean(errors), 2))
# explained_variance_score expects (y_true, y_pred). The score is not
# symmetric, so the previously swapped arguments reported a different number.
print('Explained Variance:', explained_variance_score(y_test, predictions))
# Baseline: always predict a 20% tip. An integer test_size=1 asks
# train_test_split for exactly ONE test row, so the baseline was scored on a
# single trip. test_size=0.2 with the same random_state evaluates it on a
# 20% hold-out, like the models.
X_train, X_test, y_train, y_test = train_test_split(
    tip_data_wo_dummy, tip_data['Tip_percent_amount'],
    test_size=0.2, random_state=42)
predictions = np.full(y_test.shape[0], 20.0)
print('RMSE=', np.sqrt(mse(predictions, y_test)))
print('Mean Absolute Error:', round(np.mean(abs(predictions - y_test)), 2))
The heuristic of always predicting a 20% tip percentage does better than all 3 models.